library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
library(rtweet)
library(maps)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(syuzhet)
##
## Attaching package: 'syuzhet'
## The following object is masked from 'package:rtweet':
##
## get_tokens
library(reactable)
library(wordcloud2)
library(stringr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(tidytext)
library(topicmodels)
# Tokenize a text vector into n-grams: each n-gram is n consecutive words
# joined by a single space. NOTE: despite the name, this handles any n,
# not just bigrams. Relies on NLP::words() / NLP::ngrams() (loaded via tm).
BigramTokenizer <- function(x, n) {
  grams <- lapply(ngrams(words(x), n), paste, collapse = " ")
  unlist(grams, use.names = FALSE)
}
# Clean a character vector of raw tweet text: strip URLs, "&"->"and",
# punctuation, leading retweet markers, @mentions, #hashtags and newlines,
# then lower-case and trim surrounding whitespace. (Relies on stringr.)
# Note: punctuation is removed before the "^RT:? " pattern is applied, so
# the colon variant will already be gone by that step (original behaviour).
clean_tweets <- function(x) {
  out <- str_remove_all(x, " ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)")
  out <- str_replace_all(out, "&", "and")
  out <- str_remove_all(out, "[[:punct:]]")
  out <- str_remove_all(out, "^RT:? ")
  out <- str_remove_all(out, "@[[:alnum:]]+")
  out <- str_remove_all(out, "#[[:alnum:]]+")
  out <- str_replace_all(out, "\\\n", " ")
  out <- str_to_lower(out)
  str_trim(out, "both")
}
# Delete every token that starts with "http" up to the next whitespace.
removeURL <- function(x) {
  url_pattern <- "http[^[:space:]]*"
  gsub(pattern = url_pattern, replacement = "", x = x)
}
# Keep only letters and whitespace; digits, punctuation and symbols are
# deleted (not replaced by spaces, so "ab1c" becomes "abc").
removeNumPunct <- function(x) {
  keep_only_alpha_space <- "[^[:alpha:][:space:]]*"
  gsub(keep_only_alpha_space, "", x)
}
# Delete every @handle up to the next whitespace character.
removeUsername <- function(x) {
  handle_pattern <- "@[^[:space:]]*"
  gsub(pattern = handle_pattern, replacement = "", x = x)
}
# Collapse " <single character> " down to a single space.
# NOTE(review): the dot is an unescaped regex wildcard, so this removes ANY
# one-character token (letter, digit or punctuation) surrounded by spaces;
# it does not touch single characters at the start or end of the string.
removeSingle <- function(x) {
  gsub(pattern = " . ", replacement = " ", x = x)
}
### Setup: stop word list, working directory, and the Biden tweet data set.
# Custom stop words: English defaults plus Twitter-specific noise tokens.
mystopwords <- c(stopwords("english"), c("https", "t.co", "it's")) #or read in your stop word list
path <- "C:/Users/HP/OneDrive/Documents/Twitter_sentimental_analysis" #Set your own path if using.
setwd(path)
# Build the file name from `path` instead of repeating the absolute path.
df <- read.csv(file.path(path, "BidenJuly21SearchTweets.csv"))
dfCopy <- df # used to do the ngrams, need the context (uncleaned copy)
reactable(df, searchable = TRUE, filterable = TRUE) #nice way to review the data, searchable and filterable
### To isolate only Tweets set to filter out the following
# which() also drops rows where is_retweet is NA (a plain == comparison
# would keep those rows as all-NA rows).
df <- df[which(df$is_retweet == FALSE), ] #keep tweets that have not been retweeted
df <- df[is.na(df$reply_to_status_id), ] #keep tweets left not replied to
### Start cleaning the text column, some double is fine.
df$text <- clean_tweets(df$text) # strip URLs, handles, hashtags, case
df$text <- removeWords(df$text, mystopwords) #can remove stopwords using this call also
dim(df)
## [1] 551 90
### create the Corpus or VCorpus.
# Build a volatile corpus from the cleaned tweet text, then run a second
# pass of tm transformations. Some steps overlap with clean_tweets() above;
# the duplication is deliberate ("some double is fine").
w <- VCorpus(VectorSource(df$text))
w <- tm_map(w, content_transformer(removeNumPunct))
w <- tm_map(w, removeNumbers)
w <- tm_map(w, content_transformer(removeURL))
w <- tm_map(w, removePunctuation)
w <- tm_map(w, content_transformer(tolower))
w <- tm_map(w, removeWords, mystopwords)
w <- tm_map(w, stripWhitespace)
w <- tm_map(w, content_transformer(removeUsername))
w <- tm_map(w, content_transformer(removeSingle))
### Topic modelling
# Term-document matrix (terms as rows) for frequency/association analysis;
# document-term matrix (documents as rows) is the input format LDA expects.
tdm <- TermDocumentMatrix(w)
dtm <- DocumentTermMatrix(w)
# Processing using standard packages and methods
# Terms that occur at least 15 times across the whole corpus.
frequency <- findFreqTerms(tdm, lowfreq=15)
frequency
## [1] "america" "american" "americans" "back"
## [5] "biden" "black" "can" "cant"
## [9] "cnn" "come" "country" "day"
## [13] "democratic" "dnc" "doesnt" "dont"
## [17] "election" "even" "foxnews" "get"
## [21] "going" "good" "gop" "hes"
## [25] "hillaryclinton" "joe" "joebiden" "joyannreid"
## [29] "just" "kanyewest" "know" "like"
## [33] "maga" "make" "msnbc" "muslim"
## [37] "need" "never" "now" "one"
## [41] "people" "please" "potus" "president"
## [45] "realdonaldtrump" "really" "running" "schools"
## [49] "see" "speakerpelosi" "stop" "support"
## [53] "thedemocrats" "think" "time" "trump"
## [57] "vote" "voting" "want" "white"
## [61] "will" "youre"
# Horizontal bar chart of term counts: keep terms seen at least `limit`
# times, then plot only those seen more than `spoint` times.
limit <- 5
spoint <- 20
freq <- rowSums(as.matrix(tdm))
freq <- freq[freq >= limit] # same effect as subset(freq, freq >= limit)
dfreq <- data.frame(term = names(freq), freq = freq)
ggplot(dfreq[dfreq$freq > spoint, ], aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip()
library("RGraphics")
## Loading required package: grid
### Word clouds and NRC sentiment barplot for the Biden corpus.
m <- as.matrix(tdm)
word.freq <- sort(rowSums(m), decreasing = TRUE) # term counts, largest first
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 3, random.order = FALSE)
# wordcloud2 wants a data frame with columns `word` then `freq`, in that
# order. Build it directly rather than appending/reordering/renaming
# columns in four steps as before.
words <- data.frame(word = names(word.freq), freq = word.freq)
wordcloud2(words[words[, 2] > 3, ], size = 3, color = "random-dark")
# NRC sentiment: scores each distinct word once (not weighted by how often
# the word occurs in the corpus).
sentText <- get_nrc_sentiment(words$word)
a <- as.data.frame(sort(colSums(sentText)))
barplot(a[, 1], names = row.names(a), las = 2)
### Hierarchical clustering of frequent terms (Biden corpus).
# removeSparseTerms(sparse = 0.95) keeps only terms present in at least
# ~5% of the documents, so the dendrogram stays readable.
tdm2 <- removeSparseTerms(tdm, sparse = 0.95)
m2 <- as.matrix(tdm2)
distMatrix <- dist(scale(m2)) #note need to scale central mean
fit <- hclust(distMatrix, method = "ward.D")
plot(fit)
rect.hclust(fit, k = 6) # draw 6 groupings
### Topic Modelling
# Drop documents that became empty after cleaning: LDA requires every
# document to contain at least one term.
rowTotals <- apply(dtm , 1, sum)
dtm2 <- dtm[rowTotals> 0, ] #leave out 0 rows
# NOTE(review): LDA is stochastic and no seed is passed, so the topics
# below will differ between runs; use control = list(seed = ...) for
# reproducible output.
lda <- LDA(dtm2, k = 6) # find n topics
term <- terms(lda, 4) # first 4 terms of every topic
term
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "joebiden" "joebiden" "joebiden" "joebiden" "joebiden"
## [2,] "realdonaldtrump" "potus" "will" "realdonaldtrump" "one"
## [3,] "will" "president" "biden" "trump" "biden"
## [4,] "black" "joe" "america" "cnn" "muslim"
## Topic 6
## [1,] "joebiden"
## [2,] "will"
## [3,] "realdonaldtrump"
## [4,] "want"
### N-grams on the uncleaned copy (dfCopy), so retweets/replies are
### included and @handles survive into the n-grams — "need the context".
text<- clean_tweets(dfCopy$text)
text<- removeWords(text, mystopwords)
ngram <- 3 #set size of the word group
ngList = BigramTokenizer(text, ngram) # get the set of 3 word groups
x <- as.data.frame(sort(table(ngList),decreasing=T)) #use table to get the counts, set as a df
x$ngList<-as.character(x$ngList) #make sure not blessed factors
head(x, 10)
## ngList Freq
## 1 hkrassenstein realdonaldtrump joebiden 118
## 2 joebiden joyannreid msnbc 96
## 3 howardfineman joebiden joyannreid 92
## 4 joyannreid msnbc ewarren 92
## 5 realdonaldtrump joebiden potus 78
## 6 joebiden potus dnc 73
## 7 dumpoperation hkrassenstein realdonaldtrump 51
## 8 4aof newjeffct nhojhpesoj 35
## 9 bamableu teebeedee1 smackeycracks 35
## 10 boomer818 elenasfca mcatnip 35
# Word cloud of all 3-grams that occur more than once.
wordcloud2(x[x$Freq>1,], size=0.5, color='random-dark')
### Load and filter the Trump tweet data set (same pipeline as for Biden).
# Build the file name from `path` instead of repeating the absolute path.
df1 <- read.csv(file.path(path, "TrumpJuly21SearchTweets.csv"))
dfCopy1 <- df1 # uncleaned copy kept for the n-gram analysis
reactable(df1, searchable = TRUE, filterable = TRUE) #nice way to review the data, searchable and filterable
### To isolate only Tweets set to filter out the following
# which() also drops rows where is_retweet is NA (a plain == comparison
# would keep those rows as all-NA rows).
df1 <- df1[which(df1$is_retweet == FALSE), ] #keep tweets that have not been retweeted
df1 <- df1[is.na(df1$reply_to_status_id), ] #keep tweets left not replied to
dim(df1)
## [1] 648 90
### Start cleaning the text column, some double is fine.
df1$text <- clean_tweets(df1$text) # strip URLs, handles, hashtags, case
df1$text <- removeWords(df1$text, mystopwords) #can remove stopwords using this call also
### create the Corpus or VCorpus
# Same corpus-building and cleaning pipeline as the Biden section above.
w1 <- VCorpus(VectorSource(df1$text))
w1<- tm_map(w1, content_transformer(removeNumPunct))
w1 <- tm_map(w1, removeNumbers)
w1 <- tm_map(w1, content_transformer(removeURL))
w1 <- tm_map(w1, removePunctuation)
w1 <- tm_map(w1, content_transformer(tolower))
w1 <- tm_map(w1, removeWords, mystopwords)
w1 <- tm_map(w1, stripWhitespace)
w1 <- tm_map(w1, content_transformer(removeUsername))
w1 <- tm_map(w1, content_transformer(removeSingle))
tdm1 <- TermDocumentMatrix(w1)
dtm1 <- DocumentTermMatrix(w1) #used in topic modelling
# Terms occurring at least 15 times, then a bar chart of counts for terms
# seen more than `spoint` times. NOTE(review): unlike the Biden plot this
# one has no coord_flip(), so term labels sit on the x axis — confirm the
# asymmetry is intended.
frequency1 <- findFreqTerms(tdm1, lowfreq=15)
freq1 <- rowSums(as.matrix(tdm1))
limit <- 5
freq1 <- subset(freq1, freq1 >= limit)
dfreq1 <- data.frame(term = names(freq1), freq1 = freq1)
spoint <- 20
ggplot(dfreq1[dfreq1$freq1>spoint,], aes(x=term, y=freq1)) + geom_bar(stat = "identity") + xlab("Terms") + ylab("Count")
### Look at correlated word associations
# findAssocs() reports terms whose per-document frequencies correlate with
# the target term at or above corLimit1 (Pearson correlation).
corLimit1 <- 0.25
term1<- "trump"
findAssocs(tdm1, term1, corLimit1)
## $trump
## mail ballot kentucky florida
## 0.35 0.33 0.33 0.31
# "perfect" has no correlated terms at this threshold — numeric(0) below.
term2<-"perfect"
findAssocs(tdm1, term2, corLimit1)
## $perfect
## numeric(0)
term3<-"obama"
findAssocs(tdm1, term3, corLimit1)
## $obama
## autocrat capita ebola fascists projecting radicals recuses
## 0.71 0.71 0.71 0.71 0.71 0.71 0.71
## regard spies turns unfit don calls conspiracy
## 0.71 0.71 0.71 0.71 0.58 0.50 0.50
## flu leading per corrupt epstein instead cases
## 0.50 0.50 0.50 0.44 0.41 0.41 0.35
## etc leadership world joke dems friends like
## 0.35 0.35 0.35 0.31 0.29 0.29 0.29
## protesters makes qanon
## 0.29 0.26 0.26
### 3d Plot wordclouds and barplot of word frequency sorted
m1 <- as.matrix(tdm1)
word.freq1 <- sort(rowSums(m1), decreasing = TRUE) # term counts, largest first
wordcloud(words = names(word.freq1), freq = word.freq1, min.freq = 3, random.order = FALSE)
# to use wordcloud2 need word, freq and that order — build the data frame
# directly instead of appending/reordering/renaming columns in four steps.
words1 <- data.frame(word = names(word.freq1), freq = word.freq1)
wordcloud2(words1[words1[, 2] > 3, ], size = 3, color = "random-dark")
### simple sentiment barplot of range of emotions
# NRC sentiment: scores each distinct word once (not frequency-weighted).
sentText1 <- get_nrc_sentiment(words1$word)
a1 <- as.data.frame(sort(colSums(sentText1)))
barplot(a1[, 1], names = row.names(a1), las = 2)
### Cluster Plot (Trump corpus)
# BUG FIX: this section analyses the Trump data, so cluster tdm1 —
# the original passed `tdm` and silently re-clustered the Biden matrix.
tdm2 <- removeSparseTerms(tdm1, sparse = 0.95) # keep terms in >=5% of docs
m2 <- as.matrix(tdm2)
distMatrix2 <- dist(scale(m2)) #note need to scale central mean
fit2 <- hclust(distMatrix2, method = "ward.D")
plot(fit2)
rect.hclust(fit2, k = 6) # draw 6 groupings
### 3g Topic Modelling
# Drop documents that became empty after cleaning: LDA requires every
# document to contain at least one term.
rowTotals2 <- apply(dtm1 , 1, sum)
dtm1 <- dtm1[rowTotals2> 0, ] #leave out 0 rows
# NOTE(review): LDA is stochastic and no seed is passed, so the topics
# below will differ between runs.
lda1 <- LDA(dtm1, k = 6) # find n topics
term1 <- terms(lda1, 4) # first 4 terms of every topic
term1
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "realdonaldtrump" "realdonaldtrump" "realdonaldtrump" "realdonaldtrump"
## [2,] "trump" "will" "portland" "loser"
## [3,] "covid" "trump" "will" "president"
## [4,] "know" "can" "right" "will"
## Topic 5 Topic 6
## [1,] "realdonaldtrump" "realdonaldtrump"
## [2,] "mask" "portland"
## [3,] "will" "people"
## [4,] "now" "whitehouse"
### 3i look at N-grams, in particular 3n
# BUG FIX: the column is `text`, not `text1`. dfCopy1$text1 was NULL, which
# is why the original run produced an empty n-gram table (`<0 rows>`).
text1 <- clean_tweets(dfCopy1$text)
text1 <- removeWords(text1, mystopwords)
ngram <- 3 #set size of the word group
ngList1 <- BigramTokenizer(text1, ngram) # get the set of 3 word groups
x1 <- as.data.frame(sort(table(ngList1), decreasing = TRUE)) #use table to get the counts, set as a df
x1$ngList1 <- as.character(x1$ngList1) #make sure not blessed factors
head(x1, 10)
## (the output previously shown here was empty because of the text1 bug)
### Plot wordcloud
# BUG FIX: plot the Trump n-grams (x1); the original plotted the Biden
# n-gram table `x` again.
wordcloud2(x1[x1$Freq > 1, ], size = 0.5, color = 'random-dark') #single most informative visualisation!
In recent years, the application of opinion mining for sentiment analysis has gained momentum; it concentrates on identifying and interpreting emotions and public opinions about a desired subject or object from textual data. In this work, sentiment analysis was performed on tweets retrieved in the context of the US Election 2020. The polarity of each tweet is judged from its sentiment expression. The fundamental tasks involved in opinion mining include extraction of data, clustering of the extracted data, and its classification.
Here we collected tweet data about Joe Biden and Donald Trump. For this, we used the following keywords: “JoeBiden”, “DonaldTrump”, “BidenHarris”, “US Election 2020”, and “TrumpPence”. We collected around 1.25M tweets covering the period from October 2019 to July 2020. Figure 1 shows our proposed system workflow.